suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(pvclust)
library(tidyverse)
library(tximport)
library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
hpal <- colorRampPalette(c("blue","white","red"))(100)
samples <- read_csv(here("doc/macrogen-samples.csv"),
col_types=cols(.default=col_factor()))
JH: I use the Expression Profile.gene.txt from StringTie and I select only the Feature_GID and the read_count columns of each sample.
I transform the column Feature_GID into the row names => this will be my ‘counts’ object
ND I added a filter to remove small RNA (“misc_RNA”, “rRNA”, “snRNA”, “snoRNA”,“tRNA” )
counts <- suppressWarnings(suppressMessages(read_tsv(here("data/macrogen/ALB_RNAseq_for_Nico/R_analysis_JH/salmon/Expression_Profile.GCF_006496715.gene.txt"),
col_names = TRUE, col_types = cols(.default=col_guess())) %>%
filter(Type %in% c("protein_coding","pseudogene","lncRNA","transcribed_pseudogene")) %>%
select(c("Feature_GID",ends_with("Read_Count"))) %>%
column_to_rownames("Feature_GID")))
Sanity check to ensure that the data is sorted according to the sample info
stopifnot(all(colnames(counts) == samples$Sample_ID))
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "23.6% percent (8931) of 37866 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=Virus)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
ggplot(dat,aes(x,y,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
ggplot(dat,aes(x,y,fill=Cells)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage shows a very high shoulder on the right side.
A lot of genes are lowly expressed. This is probably indicating too high a sequencing depth.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 8931 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by the different meta-data.
There is no obvious deviation between samples
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Virus=samples$Virus[match(ind,samples$Sample_ID)]) %>%
mutate(Time=samples$Time[match(ind,samples$Sample_ID)]) %>%
mutate(Cells=samples$Cells[match(ind,samples$Sample_ID)])
ggplot(dat,aes(x=values,group=ind,col=Virus)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 292536 rows containing non-finite values (stat_density).
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 292536 rows containing non-finite values (stat_density).
ggplot(dat,aes(x=values,group=ind,col=Cells)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 292536 rows containing non-finite values (stat_density).
dir.create(here("data/analysis_macrogen/bioQA"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis_macrogen/bioQA/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
## JH: I do not have txi so I use DESeqDataSetFromMatrix instead
## JH: I block the effect of the Cells in the design, as we do not want yet to model it in the DEG 1 to 4
dds <- DESeqDataSetFromMatrix(
counts,
colData = samples,
design = ~ Cells + Virus * Time
)
## converting counts to integer mode
save(dds,file=here("data/analysis_macrogen/bioQA/dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is relatively uniform -15% / + 30% of the average
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| 9_Read_Count | 10_Read_Count | 11_Read_Count | 13_Read_Count | 14_Read_Count |
|---|---|---|---|---|
| 1.125 | 0.9863 | 0.9059 | 0.9279 | 0.996 |
| 15_Read_Count | 37_Read_Count | 39_Read_Count | 40_Read_Count | 41_Read_Count |
|---|---|---|---|---|
| 1.084 | 0.8832 | 1.003 | 0.9704 | 1.02 |
| 42_Read_Count | 43_Read_Count | 45_Read_Count | 46_Read_Count | 47_Read_Count |
|---|---|---|---|---|
| 0.9899 | 1.292 | 1.08 | 0.8196 | 0.9273 |
| 49_Read_Count | 50_Read_Count | 52_Read_Count |
|---|---|---|
| 1.046 | 1.251 | 0.861 |
boxplot(sizes, main="Sequencing libraries size factor")
Assess whether there might be a difference in library size linked to a given metadata
We should keep an eye on the Virus=yes samples as they seem to have a slightly deeper sequencing depth than the no counterpart.
boxplot(split(sizes,dds$Virus),las=2,
main="Sequencing libraries size factor by treatment with Virus")
boxplot(split(sizes,dds$Time),las=2,
main="Sequencing libraries size factor by Time")
boxplot(split(sizes,dds$Cells),las=2,
main="Sequencing libraries size factor by Cell liness")
It does not seem all so bad, a couple samples are outliers for the “yes” class.
plot(sizes,log10(colSums(counts(dds))),ylab="log10 raw depth",xlab="scaling factor",
col=rainbow(n=nlevels(dds$Virus))[as.integer(dds$Virus)],pch=19)
legend("bottomright",fill=rainbow(n=nlevels(dds$Virus)),
legend=levels(dds$Virus),cex=0.6)
For visualisation, in addition to the correction for the sequencing depth above, we want to correct for the mean-variance relationship that exists in RNA-Seq data, hence we apply a transformation to render the data homoscedastic
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately, impressively so even.
meanSdPlot(vst[rowSums(vst)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=3
An the number of possible combinations
nlevel=nlevels(dds$Virus) * nlevels(dds$Time) * nlevels(dds$Cells)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
A lot of the variance is explained by the first components, while all samples are needed to explain all the variance in the data
The inflection point arise early (4) after which the increase is linearly constant
All these are promising.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
## Warning: Removed 3 row(s) containing missing values (geom_path).
The first components separates the cell types. The second is the time within C6.36
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(dds)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Cells,text=Sample_ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Number of genes expressed per condition at different cutoffs
Here we try to assess whether there would be any link between sequencing depth and the variable of interest.
It would seem not to be the case or limited to highly expressed genes in the case of a Virus yes vs. no comparison
conds <- factor(paste(dds$Cells,dds$Time))
dev.null <- rangeSamplesSummary(counts=vst,
conditions=conds,
nrep=3)
dev.null <- rangeSamplesSummary(counts=vst,
conditions=dds$Virus,
nrep=6)
Filter for noise
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 3 y values <= 0 omitted from
## logarithmic plot
vst.cutoff <- 1
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = paste(conds,dds$Virus),
col=hpal)
There are a few interesting things here. U4.4 further separates based on virus presence.
C6.36 does not at 24h and neither does it at 6h unless there would be a sample swap…
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=paste(conds,dds$Virus))
Done to assess the previous dendrogram’s reproducibility
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 1000, parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
There is little doubt about the clustering, the au score is always close to the maximum (100 for a 0-100 range)
plot(hm.pvclust, labels = conds)
pvrect(hm.pvclust)
The data is of good quality. It might have been sequenced too deeply.
The data might not satisfy the expected study design, as the Virus effect is minimal and only appears in the U4.4 cell type.
A putative sample swap for the C6.36 cells at 6h could be investigated, but it might just be a random effect due to the little variance effect created by the Virus variable overall.
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.58.0 tximport_1.18.0
## [3] forcats_0.5.1 stringr_1.4.0
## [5] dplyr_1.0.6 purrr_0.3.4
## [7] readr_1.4.0 tidyr_1.1.3
## [9] tibble_3.1.1 tidyverse_1.3.1
## [11] pvclust_2.2-0 plotly_4.9.4
## [13] pander_0.6.3 hyperSpec_0.99-20201127
## [15] xml2_1.3.2 ggplot2_3.3.3
## [17] lattice_0.20-41 here_1.0.1
## [19] gplots_3.1.1 DESeq2_1.30.1
## [21] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [23] MatrixGenerics_1.2.1 matrixStats_0.59.0
## [25] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
## [27] IRanges_2.24.1 S4Vectors_0.28.1
## [29] BiocGenerics_0.36.1 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-1 ellipsis_0.3.2 rprojroot_2.0.2
## [4] XVector_0.30.0 fs_1.5.0 rstudioapi_0.13
## [7] hexbin_1.28.2 farver_2.1.0 affyio_1.60.0
## [10] bit64_4.0.5 AnnotationDbi_1.52.0 fansi_0.5.0
## [13] lubridate_1.7.10 splines_4.0.3 cachem_1.0.5
## [16] geneplotter_1.68.0 knitr_1.33 jsonlite_1.7.2
## [19] broom_0.7.6 annotate_1.68.0 dbplyr_2.1.1
## [22] png_0.1-7 BiocManager_1.30.15 compiler_4.0.3
## [25] httr_1.4.2 backports_1.2.1 assertthat_0.2.1
## [28] Matrix_1.3-3 fastmap_1.1.0 lazyeval_0.2.2
## [31] limma_3.46.0 cli_2.5.0 htmltools_0.5.1.1
## [34] tools_4.0.3 affy_1.68.0 gtable_0.3.0
## [37] glue_1.4.2 GenomeInfoDbData_1.2.4 Rcpp_1.0.6
## [40] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [43] preprocessCore_1.52.1 crosstalk_1.1.1 xfun_0.22
## [46] rvest_1.0.0 testthat_3.0.2 lifecycle_1.0.0
## [49] gtools_3.9.2 XML_3.99-0.6 zlibbioc_1.36.0
## [52] scales_1.1.1 hms_1.1.0 RColorBrewer_1.1-2
## [55] yaml_2.2.1 memoise_2.0.0 sass_0.4.0
## [58] latticeExtra_0.6-29 stringi_1.6.2 RSQLite_2.2.7
## [61] highr_0.9 genefilter_1.72.1 caTools_1.18.2
## [64] BiocParallel_1.24.1 rlang_0.4.11 pkgconfig_2.0.3
## [67] bitops_1.0-7 evaluate_0.14 labeling_0.4.2
## [70] htmlwidgets_1.5.3 bit_4.0.4 tidyselect_1.1.1
## [73] magrittr_2.0.1 R6_2.5.0 generics_0.1.0
## [76] DelayedArray_0.16.3 DBI_1.1.1 pillar_1.6.1
## [79] haven_2.4.1 withr_2.4.2 survival_3.2-10
## [82] RCurl_1.98-1.3 modelr_0.1.8 crayon_1.4.1
## [85] KernSmooth_2.23-18 utf8_1.2.1 rmarkdown_2.8
## [88] jpeg_0.1-8.1 locfit_1.5-9.4 readxl_1.3.1
## [91] blob_1.2.1 reprex_2.0.0 digest_0.6.27
## [94] xtable_1.8-4 munsell_0.5.0 viridisLite_0.4.0
## [97] bslib_0.2.5.1